How to use iCellR for analyzing scRNA-seq data
Download sample data
# set your working directory
setwd("/your/download/directory")
# save the URL as an object
sample.file.url = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz"
# download the file
download.file(url = sample.file.url,
destfile = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
method = "auto")
# unzip the file.
untar("pbmc3k_filtered_gene_bc_matrices.tar.gz")
Load iCellR package
library(iCellR)
Read 10x data
my.data <- load10x("filtered_gene_bc_matrices/hg19/",gene.name = "geneSymbol")
my.data <- read.table("GSM2906461_PeripheralBlood3_dge.txt",header=TRUE)
Look at the first few lines and columns of the sample data. And count the rows and columns.
head(my.data)[1:5]
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
## MIR1302.10 0 0 0
## FAM138A 0 0 0
## OR4F5 0 0 0
## RP11.34P13.7 0 0 0
## RP11.34P13.8 0 0 0
## AL627309.1 0 0 0
## AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1
## MIR1302.10 0 0
## FAM138A 0 0
## OR4F5 0 0
## RP11.34P13.7 0 0
## RP11.34P13.8 0 0
## AL627309.1 0 0
dim(my.data)
## [1] 32738 2700
You can devide the sample into 3 equal parts. Assuming that you have three samples.
sample1 <- my.data[1:900]
sample2 <- my.data[901:1800]
sample3 <- my.data[1801:2700]
Merge samples
Conditions in iCellR are set in the header of the data and are separated by an underscore (_)
my.data <- data.aggregation(samples = c("sample1","sample2","sample3"),
condition.names = c("WT","KO","Ctrl"))
head(my.data)[1:5]
## WT_AAACATACAACCAC.1 WT_AAACATTGAGCTAC.1 WT_AAACATTGATCAGC.1
## A1BG 0 0 1
## A1BG.AS1 0 0 0
## A1CF 0 0 0
## A2M 0 0 0
## A2M.AS1 0 0 0
## A2ML1 0 0 0
## WT_AAACCGTGCTTCCG.1 WT_AAACCGTGTATGCG.1
## A1BG 0 0
## A1BG.AS1 0 0
## A1CF 0 0
## A2M 0 0
## A2M.AS1 0 0
## A2ML1 0 0
Make an iCellR object
my.obj <- make.obj(my.data)
Look at the object
my.obj
## [1] "An object of class iCellR version: 0.99.0"
## [2] "Raw/original data dimentions (rows,columns): 32738,2700"
## [3] "Data conditions in raw data: Ctrl,KO,WT (900,900,900)"
## [4] "Columns names: WT_AAACATACAACCAC.1,WT_AAACATTGAGCTAC.1,WT_AAACATTGATCAGC.1 ..."
## [5] "Row names: A1BG,A1BG.AS1,A1CF ..."
Run basic QC
my.obj <- qc.stats(my.obj,
which.data = "raw.data",
s.phase.genes = s.phase,
g2m.phase.genes = g2m.phase)
stats.plot(my.obj,
plot.type = "all.in.one",
out.name = "UMI-plot",
interactive = FALSE,
cell.color = "slategray3",
cell.size = 1,
cell.transparency = 0.5,
box.color = "red",
box.line.col = "green")

# Scatter plots
stats.plot(my.obj, plot.type = "point.mito.umi", out.name = "mito-umi-plot",interactive = F)
stats.plot(my.obj, plot.type = "point.gene.umi", out.name = "gene-umi-plot",interactive = F)

Filtering cells
iCellR allows you to filter based on library sizes (UMIs), number of genes per cell, percent mitochondrial content, one or more genes, and cell ids.
my.obj <- cell.filter(my.obj,
min.mito = 0,
max.mito = 0.05,
min.genes = 200,
max.genes = 2400,
min.umis = 0,
max.umis = Inf)
## [1] "cells with min mito ratio of 0 and max mito ratio of 0.05 were filtered."
## [1] "cells with min genes of 200 and max genes of 2400 were filtered."
## [1] "No UMI number filter"
## [1] "No cell filter by provided gene/genes"
## [1] "No cell id filter"
## [1] "filters_set.txt file has beed generated and includes the filters set for this experiment."
This step is optional and is for having the same number of cells for each condition.
# optional
# my.obj <- down.sample(my.obj)
#[1] "From"
#[1] "Data conditions: Ctrl,KO,WT (877,877,883)"
#[1] "to"
#[1] "Data conditions: Ctrl,KO,WT (877,877,877)"
Normalizing data
You have a few options to normalize your data based on your study. You can also normalize your data using tools other than iCellR and import your data to iCellR. We recommend “ranked.glsf” normalization for most single cell studies. This normalization is great for fixing matrixes with lots of zeros and because it’s geometric it is great for fixing for batch effects, as long as all the data is aggregated into one file (to aggregate your data see “aggregating data” section above).
- Choose from the following methods:
- deseq (best for bulk RNA-Seq)
- ranked.deseq (best for scRNA-Seq)
- global.glsf (best for bulk RNA-Seq)
- ranked.glsf (best for scRNA-Seq)
- rpm (best for bulk RNA-Seq)
- spike.in
For spike.in normalization you need to provide the spike.in vlaues to normalize the data with.
my.obj <- norm.data(my.obj,
norm.method = "ranked.glsf",
top.rank = 500)
Scaling data
my.obj <- data.scale(my.obj)
Calculating gene stats
my.obj <- gene.stats(my.obj, which.data = "main.data")
Make gene model for clustering
It’s best to always to avoid global clustering and use a set of model genes. In bulk RNA-seq data it is very common to cluster the samples based on top 500 genes ranked by base mean, this is to reduce the noise. In scRNA-seq data, it’s great to do so as well.
make.gene.model(my.obj,
dispersion.limit = 1.5,
base.mean.rank = 500,
no.mito.model = T,
no.cell.cycle = T,
mark.mito = T,
interactive = F,
out.name = "gene.model")
## [1] "my_model_genes.txt file is generated, which can be used for clustering."

Run PCA
my.obj <- run.pca(my.obj,
clust.method = "gene.model",
gene.list = readLines("my_model_genes.txt"),
batch.norm = F)
# Re-define model genes (run this if you have real samples)
# find.dim.genes(my.obj, dims = 1:10, top.pos = 20, top.neg = 10)
# Second round PCA and batch correction (run this if you have real sample)
#my.obj <- run.pca(my.obj,
# clust.method = "gene.model",
# gene.list = readLines("my_model_PC_genes.txt"),
# batch.norm = T)
# Visualize standard deviations for PCs
opt.pcs.plot(my.obj)

Cluster the data
Here we cluster the first 10 PC dimensions of the data. You have the option of clustering your data based on the following methods.
- Clustering methods:
- ward.D
- ward.D2
- single
- complete
- average
- mcquitty
- median
- centroid
- kmeans
- Distance methods:
- euclidean
- maximum
- manhattan
- canberra
- binary
- minkowski
- Indexing methods
- kl
- ch
- hartigan
- ccc
- scott
- marriot
- trcovw
- tracew
- friedman
- rubin
- cindex
- db
- silhouette
- duda
- pseudot2
- beale
- ratkowsky
- ball
- ptbiserial
- gap
- frey
- mcclain
- gamma
- gplus
- tau
- dunn
- hubert
- sdindex
- dindex
- sdbw
- all
my.obj <- run.clustering(my.obj,
clust.method = "kmeans",
dist.method = "euclidean",
index.method = "silhouette",
max.clust = 25,
min.clust = 2,
dims = 1:10)
Dimentionality reduction
# tSNE
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
# UMAP
my.obj <- run.umap(my.obj, dims = 1:10, method = "umap-learn")
# Diffusion map
my.obj <- run.diffusion.map(my.obj, dims = 1:10)
Create avarge expression per cluster
my.obj <- clust.avg.exp(my.obj)
Data imputaion
my.obj <- run.impute(my.obj)
Save object
save(my.obj, file = "my.obj.Robj")
Visualizing clusters and conditions
# clusters (tSNE)
cluster.plot(my.obj,
plot.type = "tsne",
interactive = F)
# Conditions (tSNE)
cluster.plot(my.obj,
plot.type = "tsne",
col.by = "conditions",
interactive = F)

# clusters (UMAP)
cluster.plot(my.obj,
plot.type = "umap",
interactive = F)
# Conditions (UMAP)
cluster.plot(my.obj,
plot.type = "umap",
col.by = "conditions",
interactive = F)

# clusters (diffusion)
cluster.plot(my.obj,
plot.type = "diffusion",
interactive = F)
# Conditions (diffusion)
cluster.plot(my.obj,
plot.type = "diffusion",
col.by = "conditions",
interactive = F)

Cell frequencies in clusters and conditions
# bar
clust.cond.info(my.obj, plot.type = "bar", normalize = T)
# pie
clust.cond.info(my.obj, plot.type = "pie", normalize = T)
## [1] "clust_cond_freq_info.txt file has beed generated."
## [1] "clust_cond_freq_info.txt file has beed generated."

Find marker genes for clusters
marker.genes <- findMarkers(my.obj,
fold.change = 2,
padjval = 0.1)
head(marker.genes)
## baseMean baseSD AvExpInCluster AvExpInOtherClusters
## GZMK 0.41991651 1.6966460 2.4363833 0.11537810
## LAG3 0.04546080 0.2827279 0.2468928 0.01503939
## CD8A 0.21021541 0.7086838 1.0163009 0.08847574
## GZMH 0.47430404 1.9741941 2.2865498 0.20060827
## CCL5 2.62409548 6.5488179 12.3863881 1.14973789
## RP11.222K16.2 0.02626753 0.2084994 0.1087713 0.01380733
## foldChange log2FoldChange pval padj clusters
## GZMK 21.116513 4.400300 7.120249e-27 1.478164e-23 1
## LAG3 16.416407 4.037066 1.168625e-10 2.410873e-07 1
## CD8A 11.486775 3.521902 8.226752e-28 1.708696e-24 1
## GZMH 11.398084 3.510719 1.184154e-19 2.452383e-16 1
## CCL5 10.773228 3.429379 2.251589e-72 4.687808e-69 1
## RP11.222K16.2 7.877796 2.977792 1.436435e-05 2.920273e-02 1
## gene cluster_1 cluster_2 cluster_3 cluster_4
## GZMK GZMK 2.4363833 0.05291050 0.00000 0.075901225
## LAG3 LAG3 0.2468928 0.00000000 0.00000 0.007326283
## CD8A CD8A 1.0163009 0.03154194 0.00000 0.029272097
## GZMH GZMH 2.2865498 0.02629519 0.00000 0.034177635
## CCL5 CCL5 12.3863881 0.21033406 33.79822 0.250237643
## RP11.222K16.2 RP11.222K16.2 0.1087713 0.00000000 0.00000 0.000000000
## cluster_5 cluster_6 cluster_7
## GZMK 0.40592901 0.127154863 0.027465519
## LAG3 0.01062049 0.026494853 0.000000000
## CD8A 0.11671291 0.141884155 0.017431855
## GZMH 2.63214509 0.016212854 0.027473883
## CCL5 8.84515165 0.602894477 0.122993436
## RP11.222K16.2 0.13921096 0.008496299 0.003296331
Visualizing gene expressions
MyGenes <- unique(top.markers(marker.genes, topde = 10, min.base.mean = 0.2))
MyGenes
## [1] "GZMK" "CD8A" "GZMH" "CCL5" "KLRG1"
## [6] "CD8B" "LYAR" "CST7" "GZMA" "NKG7"
## [11] "IGLL5" "LINC00926" "CD79A" "TCL1A" "MS4A1"
## [16] "CD79B" "HLA.DQB1" "HLA.DQA1" "HLA.DQA2" "HVCN1"
## [21] "PPBP" "PF4" "GPX1" "TAGLN2" "CALM3"
## [26] "OAZ1" "ACTB" "MYL6" "ITM2B" "S100A8"
## [31] "S100A9" "LGALS2" "CD14" "MS4A6A" "LYZ"
## [36] "FCN1" "GRN" "CDA" "BLVRB" "GNLY"
## [41] "GZMB" "SPON2" "FGFBP2" "PRF1" "CCL4"
## [46] "CD247" "LEF1" "AQP3" "CCR7" "PRKCQ.AS1"
## [51] "TRAT1" "FLT3LG" "PIK3IP1" "IL7R" "LDHB"
## [56] "TCF7" "MS4A7" "FCGR3A" "IFITM3" "LST1"
## [61] "HCK" "RHOC" "SERPINA1" "IFI30" "AIF1"
## [66] "FCER1G"
genelist = c("PPBP","LYZ","MS4A1","GNLY","LTB","NKG7","IFITM2","CD14","S100A9")
for(i in genelist){
MyPlot <- gene.plot(my.obj, gene = i,
interactive = F,
plot.data.type = "umap",
cell.transparency = 1)
i <- gsub("-",".",i)
eval(call("<-", as.name(i), MyPlot))
}
##
library(gridExtra)
grid.arrange(PPBP,LYZ,MS4A1,GNLY,LTB,NKG7,IFITM2,CD14,S100A9)

genelist = c("PPBP","LYZ","MS4A1","GNLY","LTB","NKG7","IFITM2","CD14","S100A9")
for(i in genelist){
MyPlot <- gene.plot(my.obj, gene = i,
interactive = F,
plot.data.type = "tsne",
cell.transparency = 1)
i <- gsub("-",".",i)
eval(call("<-", as.name(i), MyPlot))
}
##
library(gridExtra)
grid.arrange(PPBP,LYZ,MS4A1,GNLY,LTB,NKG7,IFITM2,CD14,S100A9)

QC on clusters
clust.stats.plot(my.obj, plot.type = "box.mito", interactive = F)
clust.stats.plot(my.obj, plot.type = "box.gene", interactive = F)

Cell type prediction using ImmGen
## get marker genes for cluster 2
Cluster = 2
MyGenes <- unique(top.markers(marker.genes, topde = 40, min.base.mean = 0.2, cluster = Cluster))
MyGenes
## [1] "IGLL5" "LINC00926" "CD79A" "TCL1A" "MS4A1"
## [6] "CD79B" "HLA.DQB1" "HLA.DQA1" "HLA.DQA2" "HVCN1"
## [11] "CD74" "HLA.DMB" "HLA.DRA" "HLA.DMA" "HLA.DPB1"
## [16] "HLA.DRB1" "HLA.DRB5" "HLA.DPA1" "SNX2" "CD37"
## [21] "LY86" "NCF1" "FAIM3" "SNHG7" "ISG20"
## [26] "FAM26F" "CXCR4" "SEC62" "RCSD1" "TMEM243"
## [31] "POLD4" "LAPTM5" "DCK" "EZR" "PAPOLA"
## [36] "POU2F2" "RNASEH2B"
# ImmGen dot plot
cell.type.pred(immgen.data = "rna", gene = MyGenes, plot.type = "point.plot")
cell.type.pred(immgen.data = "uli.rna", gene = MyGenes, plot.type = "point.plot", top.cell.types = 50)
As seen in the dot plots the B cells are in the top, suggesting that cluster 2 is B cells.

# ImmGen dot plot
cell.type.pred(immgen.data = "rna", gene = MyGenes, plot.type = "heatmap")
cell.type.pred(immgen.data = "uli.rna", gene = MyGenes, plot.type = "heatmap")
As seen in the heatmaps the B cells are more expressed for these genes.


# ImmGen dot plot
cell.type.pred(immgen.data = "mca", gene = MyGenes, plot.type = "point.plot")
cell.type.pred(immgen.data = "mca", gene = MyGenes, plot.type = "heatmap")


Differential Expression Analysis
# Between clusters
diff.res <- diff.exp(my.obj, de.by = "clusters", cond.1 = c(1,4), cond.2 = c(2))
diff.res1 <- as.data.frame(diff.res)
diff.res1 <- subset(diff.res1, padj < 0.05)
head(diff.res1)
## baseMean 1_4 2 foldChange
## ABI3 0.21809887 2.839549e-01 0.054036489 0.19029955
## AC006369.2 0.02330890 3.266529e-02 0.000000000 0.00000000
## AC079767.4 0.05360692 9.323288e-04 0.184831350 198.24696396
## AC092580.4 0.06657490 9.184304e-02 0.003626185 0.03948241
## AC093673.5 0.04228509 5.724388e-02 0.005019326 0.08768318
## ACTB 17.26037308 2.050377e+01 9.180343474 0.44773940
## log2FoldChange pval padj
## ABI3 -2.393656 9.582075e-10 1.381160e-05
## AC006369.2 -Inf 2.449500e-06 3.457715e-02
## AC079767.4 7.631155 1.004322e-09 1.447328e-05
## AC092580.4 -4.662646 1.290914e-10 1.868082e-06
## AC093673.5 -3.511556 1.230512e-07 1.755818e-03
## ACTB -1.159269 4.796900e-94 7.126275e-90
## Between conditions
# diff.res <- diff.exp(my.obj, de.by = "conditions", cond.1 = c("WT"), cond.2 = c("KO"))
## Betwen conditions in the same cluster
# diff.res <- diff.exp(my.obj, de.by = "clustBase.condComp", cond.1 = c("WT"), cond.2 = c("KO"), base.cond = 1)
## Between clusters in the same condition
# diff.res <- diff.exp(my.obj, de.by = "condBase.clustComp", cond.1 = c(1), cond.2 = c(2), base.cond = "WT")
Volcano and MA plots
# Volcano Plot
volcano.ma.plot(diff.res,
sig.value = "pval",
sig.line = 0.05,
plot.type = "volcano",
interactive = F)
# MA Plot
volcano.ma.plot(diff.res,
sig.value = "pval",
sig.line = 0.05,
plot.type = "ma",
interactive = F)

Data manipulation
# let's say you want to merge cluster 3 and 2.
my.obj <- change.clust(my.obj, change.clust = 3, to.clust = 2)
# to reset to the original clusters run this.
my.obj <- change.clust(my.obj, clust.reset = T)
# you can also re-name the cluster numbers to cell types. Remember to reset after this so you can ran other analysis.
my.obj <- change.clust(my.obj, change.clust = 7, to.clust = "B Cell")
# Let's say for what ever reason you want to remove acluster, to do so run this.
my.obj <- clust.rm(my.obj, clust.to.rm = 1)
Cell gating
# first make a plot to gate
my.plot <- gene.plot(my.obj, gene = "GNLY",
plot.type = "scatterplot",
clust.dim = 2,
interactive = F)
# gate and download cell ids.
cell.gating(my.obj, my.plot = my.plot)
# once the cell ids are download (cellGating.txt), you can assign a cluster number to them.
my.obj <- gate.to.clust(my.obj, my.gate = "cellGating.txt", to.clust = 10)


Pseudotime analysis
MyGenes <- unique(top.markers(marker.genes, topde = 10, min.base.mean = 0.2))
#
pseudotime.tree(my.obj,
marker.genes = MyGenes,
type = "unrooted",
clust.method = "complete")
#
pseudotime.tree(my.obj,
marker.genes = MyGenes,
type = "jitter",
clust.method = "complete")


Cell cycle prediction
# load data
my.data <- read.table(file = "nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1)
# make iCellR object
my.obj <- make.obj(my.data)
# QC
my.obj <- qc.stats(my.obj,
s.phase.genes = s.phase,
g2m.phase.genes = g2m.phase)
# Cell cycle prediction
my.obj <- cc(my.obj, s.genes = s.phase, g2m.genes = g2m.phase)
# see first few lines
head(my.obj@stats)
## CellIds nGenes UMIs mito.percent S.phase.probability
## Prog_013 Prog_013 10211 2563089 0.05187803 0.005977553
## Prog_019 Prog_019 9991 3030620 0.05965875 0.004428137
## Prog_031 Prog_031 10192 1293487 0.05664456 0.001705467
## Prog_037 Prog_037 9599 1357987 0.07026061 0.002280581
## Prog_008 Prog_008 10540 4079891 0.06460295 0.015765372
## Prog_014 Prog_014 10788 2569783 0.07667107 0.006875289
## g2m.phase.probability S.Score G2M.Score Phase
## Prog_013 0.001254346 -112.73408 -443.15590 G1
## Prog_019 0.016710112 -284.22544 339.95043 G2M
## Prog_031 0.001356024 -185.67488 -214.18280 G1
## Prog_037 0.019837451 -192.82171 231.04795 G2M
## Prog_008 0.009221325 717.09773 -132.27275 S
## Prog_014 0.012639978 -92.44699 81.35987 G2M
# filter
my.obj <- cell.filter(my.obj)
## [1] "No mito filter"
## [1] "No gene number filter"
## [1] "No UMI number filter"
## [1] "No cell filter by provided gene/genes"
## [1] "No cell id filter"
## [1] "filters_set.txt file has beed generated and includes the filters set for this experiment."
# Normalize
my.obj <- norm.data(my.obj, norm.method = "ranked.glsf", top.rank = 500)
# Scale
my.obj <- data.scale(my.obj)
# Run PCA
my.obj <- run.pca(my.obj, clust.method = "gene.model", gene.list = c(s.phase,g2m.phase))
# Run tSNE
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
# Run UMAP
my.obj <- run.umap(my.obj, dims = 1:10, method = "umap-learn")
# plot CC
pie(table(my.obj@stats$Phase))

# Plot tSNE
cluster.plot(my.obj,
plot.type = "tsne",
col.by = "cc",
clust.dim = 2,
interactive = F)
